library(forecast)
library(fpp2)
## Loading required package: fma
## 
## Attaching package: 'fma'
## The following object is masked from 'package:plyr':
## 
##     ozone
## Loading required package: expsmooth
library(broom)
library(tidyverse)
library(sweep)
library(tseries)
detach("package:dplyr", unload=TRUE)
## Warning: 'dplyr' namespace cannot be unloaded:
##   namespace 'dplyr' is imported by 'timetk', 'tigris', 'broom', 'tidycensus', 'sweep' so cannot be unloaded
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:xts':
## 
##     first, last
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:plyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(scales)
## 
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
## 
##     discard
## The following object is masked from 'package:readr':
## 
##     col_factor
library(GGally)
## 
## Attaching package: 'GGally'
## The following object is masked from 'package:dplyr':
## 
##     nasa
## The following object is masked from 'package:fma':
## 
##     pigs
library(e1071)
library(fpp2)
library(gridExtra)
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
theme_set(theme_minimal())

5.1 The Linear model

Simple linear regression

\[y_t = \beta_0 + \beta_1x_t + \epsilon_t\]

!(image){https://otexts.com/fpp2/fpp_files/figure-html/SLRpop1-1.png}

Example: US consumption expenditure

# Time plots of the two quarterly %-change series.
# Fixed: axis labels were swapped (x is time in years, y is % change).
autoplot(uschange[, c("Consumption", "Income")]) +
  labs(x = "Year", y = "% change")

# Scatterplot of consumption change against income change with an OLS line.
uschange %>%
  as.data.frame() %>%
  ggplot(aes(Income, Consumption)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +  # FALSE, not the reassignable F
  labs(x = "Income (quarterly % change)",
       y = "Consumption (quarterly % change)")  # fixed typo "chnage"

# Simple linear regression of consumption change on income change;
# the unassigned result prints the coefficient estimates.
tslm(Consumption ~ Income, data = uschange)
## 
## Call:
## tslm(formula = Consumption ~ Income, data = uschange)
## 
## Coefficients:
## (Intercept)       Income  
##      0.5451       0.2806

The interpretation of the intercept requires that a value of x = 0 makes sense.

In this case, when x = 0 — meaning income has not changed since the last quarter — the predicted value of y is 0.55. Even when x = 0 makes no sense, the intercept is an important part of the model.

# Facetted time plots of all five series, then a scatterplot matrix
# of every pairwise relationship. TRUE instead of the reassignable T.
autoplot(uschange, facets = TRUE)

uschange %>%
  as.data.frame() %>%
  ggpairs()

5.2 Least squares estimation

The tslm() function fits a linear regression model to time series data. Compared to lm(), tslm() provides additional facilities for handling time series.

Example US consumption expenditure

# Multiple regression: consumption change on income, production,
# unemployment and savings changes.
fit_cos <- tslm(
  Consumption ~ Income + Production + Unemployment + Savings,
  data = uschange
)

summary(fit_cos)
## 
## Call:
## tslm(formula = Consumption ~ Income + Production + Unemployment + 
##     Savings, data = uschange)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.88296 -0.17638 -0.03679  0.15251  1.20553 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   0.26729    0.03721   7.184 1.68e-11 ***
## Income        0.71449    0.04219  16.934  < 2e-16 ***
## Production    0.04589    0.02588   1.773   0.0778 .  
## Unemployment -0.20477    0.10550  -1.941   0.0538 .  
## Savings      -0.04527    0.00278 -16.287  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3286 on 182 degrees of freedom
## Multiple R-squared:  0.754,  Adjusted R-squared:  0.7486 
## F-statistic: 139.5 on 4 and 182 DF,  p-value: < 2.2e-16

Fitted values

# Actual vs fitted values over time.
autoplot(uschange[, 'Consumption'], series = "Data") +
  autolayer(fitted(fit_cos), series = "Fitted") +
  labs(x = "Year", y = "", title = "Percent change in US consumption expenditure")

# Fitted against actual; the red 45-degree line marks a perfect fit.
# Fixed: axis labels were swapped (x is the actual data, y the fitted values).
cbind(Data = uschange[, 'Consumption'],
      Fitted = fitted(fit_cos)) %>%
  as.data.frame() %>%
  ggplot(aes(Data, Fitted)) +
  geom_point() +
  geom_abline(intercept = 0, slope = 1, color = "red") +
  labs(x = "Actual value",
       y = "Fitted value")

Goodness-of-fit

Standard error of the regression

\[\hat{\sigma_e} = \sqrt{\frac{1}{T-k-1}\sum_{t=1}^T{e_t}^2}\]

5.1 Evaluating the regression model

ACF plot of residuals

Histogram of residuals

# Residual diagnostics: time plot, ACF, histogram, and Breusch-Godfrey test.
checkresiduals(fit_cos)

## 
##  Breusch-Godfrey test for serial correlation of order up to 8
## 
## data:  Residuals from Linear regression model
## LM test = 14.874, df = 8, p-value = 0.06163
# KPSS test: the null hypothesis is level stationarity of the residuals.
kpss.test(residuals(fit_cos))
## 
##  KPSS Test for Level Stationarity
## 
## data:  residuals(fit_cos)
## KPSS Level = 0.51799, Truncation lag parameter = 4, p-value =
## 0.03762

The time plot shows some changing variation over time, but is otherwise relatively unremarkable. This heteroscedasticity will potentially make the prediction interval coverage inaccurate.

The histogram shows that the residuals seem to be slightly skewed, which may also affect the coverage probability of the prediction intervals.

The autocorrelation plot shows a spike significant at the 5% level, but it is still not quite enough for the Breusch-Godfrey test to be significant at the 5% level.

Residuals plot against predictors.

# Residuals against each predictor; a visible pattern would suggest the
# relationship with that predictor is nonlinear.
df <- as.data.frame(uschange)

df[, "Residuals"] <- as.numeric(residuals(fit_cos))

df %>%
  select(-Consumption) %>%
  # pivot_longer() replaces the superseded gather().
  pivot_longer(Income:Unemployment, names_to = "key", values_to = "value") %>%
  ggplot(aes(value, Residuals)) +
  geom_point() +
  facet_wrap( ~ key, scales = "free")

Residuals plot against fitted values

# Residuals against fitted values; look for heteroscedasticity.
augment(fit_cos) %>%
  ggplot(aes(x = .fitted, y = .resid)) +
  geom_point() +
  labs(x = "Fitted", y = "Residual")
## Don't know how to automatically pick scale for object of type ts. Defaulting to continuous.

outliers and influential observations

Outlier: observations that take extreme values compared to the majority of the data are called outliers.

Influential observation: an observation that has a large influence on the estimated coefficients of a regression model.

Spurious regression

“Non-stationary” means the values of the time series do not fluctuate around a constant mean or with a constant variance.

# Regress Australian air passengers on Guinea rice production: two
# trending but unrelated series — a textbook spurious regression.
aussies <- window(ausair, end = 2011)
fit <- tslm(aussies ~ guinearice)

summary(fit)
## 
## Call:
## tslm(formula = aussies ~ guinearice)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.9448 -1.8917 -0.3272  1.8620 10.4210 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   -7.493      1.203  -6.229 2.25e-07 ***
## guinearice    40.288      1.337  30.135  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.239 on 40 degrees of freedom
## Multiple R-squared:  0.9578, Adjusted R-squared:  0.9568 
## F-statistic: 908.1 on 1 and 40 DF,  p-value: < 2.2e-16
checkresiduals(fit)

## 
##  Breusch-Godfrey test for serial correlation of order up to 8
## 
## data:  Residuals from Linear regression model
## LM test = 28.813, df = 8, p-value = 0.000342

5.4 Some useful predictors

Trend

Dummy variables

Example: Australian Quarterly Beer Production

# Quarterly beer production from 1992 onwards.
beer2 <- window(ausbeer, start = 1992)

autoplot(beer2) +
  labs(x = "Year", y = "Megalitres")

# Seasonal naive benchmark forecast (TRUE, not the reassignable shorthand T).
autoplot(snaive(beer2, drift = TRUE)) +
  labs(x = "Year", y = "Megalitres")

We can see that this time series plot exhibits trend and seasonality. We can model this data using a regression model with a linear trend and quarterly dummy variables.

\(y_t = \beta_0 + \beta_1t +\beta_2d_{2,t} + \beta_3d_{3,t} + \beta_4d_{4,t} + \epsilon_t\)

where \(d_{i,t} = 1\) if t is in quarter i and 0 otherwise.

# Regression with a linear time trend and quarterly seasonal dummies;
# tslm() generates the `trend` and `season` regressors automatically.
fit_beer <- tslm(beer2 ~ trend + season)

summary(fit_beer)
## 
## Call:
## tslm(formula = beer2 ~ trend + season)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -42.903  -7.599  -0.459   7.991  21.789 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 441.80044    3.73353 118.333  < 2e-16 ***
## trend        -0.34027    0.06657  -5.111 2.73e-06 ***
## season2     -34.65973    3.96832  -8.734 9.10e-13 ***
## season3     -17.82164    4.02249  -4.430 3.45e-05 ***
## season4      72.79641    4.02305  18.095  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 12.23 on 69 degrees of freedom
## Multiple R-squared:  0.9243, Adjusted R-squared:  0.9199 
## F-statistic: 210.7 on 4 and 69 DF,  p-value: < 2.2e-16
# Data and fitted values over time.
autoplot(beer2, series = "Data") +
  autolayer(fitted(fit_beer), series = "Fitted") +
  labs(x = "Year", y = "Megalitres",
       title = "Quarterly Beer Production")

# Fitted against actual, coloured by quarter. Fixed: the perfect-fit
# reference line is y = x, so its intercept must be 0 (was 1); this
# matches the analogous plot earlier in the chapter.
augment(fit_beer) %>%
  ggplot(aes(beer2, .fitted, color = season)) +
  geom_point() +
  labs(x = "Actual Value",
       y = "Fitted",
       title = "Quarterly beer production") +
  geom_abline(slope = 1, intercept = 0) +
  scale_color_brewer(palette = "Dark2", name = "Quarter")

# Per-observation diagnostics: fitted values, residuals, leverage (.hat),
# Cook's distance and standardised residuals.
augment(fit_beer)
## # A tibble: 74 x 9
##    beer2 trend season .fitted  .resid   .hat .sigma    .cooksd .std.resid
##    <dbl> <int> <fct>    <dbl>   <dbl>  <dbl>  <dbl>      <dbl>      <dbl>
##  1   443     1 1         441.   1.54  0.0910   12.3 0.000349       0.132 
##  2   410     2 2         406.   3.54  0.0910   12.3 0.00185        0.304 
##  3   420     3 3         423.  -2.96  0.0898   12.3 0.00127       -0.254 
##  4   532     4 4         513.  18.8   0.0898   12.1 0.0510         1.61  
##  5   433     5 1         440.  -7.10  0.0830   12.3 0.00665       -0.606 
##  6   421     6 2         405.  15.9   0.0830   12.2 0.0334         1.36  
##  7   410     7 3         422. -11.6   0.0822   12.2 0.0176        -0.990 
##  8   512     8 4         512.   0.125 0.0822   12.3 0.00000205     0.0107
##  9   449     9 1         439.  10.3   0.0759   12.3 0.0125         0.873 
## 10   381    10 2         404. -22.7   0.0759   12.0 0.0614        -1.93  
## # … with 64 more rows

Intervention variables

Trading days

Distributed lags

Easter

5.5 Selecting Predictors

Not recommended:

  1. Plot individual x against y to see whether there is some noticeable relationship and decide whether or not to drop that x — because it is not always possible to see a relationship from a scatterplot.

  2. Do a multiple linear regression on all the x variables and disregard all variables whose p-values are greater than .05. To start with, statistical significance does not always indicate predictive value.

Instead, we will use a measure of predictive accuracy, available via the CV() function.

# Leave-one-out CV statistic together with AIC, AICc, BIC and adjusted R^2.
CV(fit_cos)
##           CV          AIC         AICc          BIC        AdjR2 
##    0.1163477 -409.2980298 -408.8313631 -389.9113781    0.7485856

Adjusted \(R^2\)

\(SSE = \sum_{t=1}^{T}{e_t}^2\)

adjusted SSE

\(\bar{R}^2 = 1 - (1- R^2)\frac{T-1}{T-k-1}\)

Cross validation

AIC

AICc

BIC

It is recommended to use AICc, AIC or CV.

5.6 Forecasting with regression

Ex-ante VS ex-post forecasts

Ex-ante forecasts are those that are made using only the information that is available in advance. These are genuine forecasts, made in advance using whatever information is available at the time.

Ex-post forecasts use later information on the predictors. An ex-post forecast of consumption may use the actual observations of the predictors. These are not genuine forecasts, but are useful for studying the behaviour of forecasting models.

Example: Australian quarterly beer production

# Ex-ante forecasts: future values of trend and season are known in
# advance, so no newdata is required.
fit_beer <- tslm(beer2 ~ trend + season)

forecast(fit_beer) %>%
  autoplot() +
  labs(x = "Year", y = "Megalitres",
       title = "Forecast of beer production using regression")  # fixed "usign"

Scenario-based forecasting

# Scenario-based forecasts: four quarters ahead under an "increase" and a
# "decrease" scenario for the predictors.
fit.consBest <- tslm(
  Consumption ~ Income + Savings + Unemployment,
  data = uschange)
h <- 4
# rep(value, h) replaces the hand-written c(1, 1, 1, 1) vectors, so the
# horizon is controlled in one place — consistent with the second scenario.
newdata <- data.frame(
    Income = rep(1, h),
    Savings = rep(0.5, h),
    Unemployment = rep(0, h))
fcast.up <- forecast(fit.consBest, newdata = newdata)
newdata <- data.frame(
    Income = rep(-1, h),
    Savings = rep(-0.5, h),
    Unemployment = rep(0, h))
fcast.down <- forecast(fit.consBest, newdata = newdata)
autoplot(uschange[, 1]) +
  ylab("% change in US consumption") +
  autolayer(fcast.up, PI = TRUE, series = "increase") +
  autolayer(fcast.down, PI = TRUE, series = "decrease") +
  guides(colour = guide_legend(title = "Scenario"))

Building a predictive regression model

Prediction intervals

Example

# Prediction intervals under an "average income change" scenario and an
# extreme income-change scenario.
fit_cons <- tslm(Consumption ~ Income, data = uschange)

fcast_avg <- forecast(fit_cons, newdata = data.frame(Income = rep(mean(uschange[, "Income"]), 4)))

fcast_up <- forecast(fit_cons, newdata = data.frame(Income = rep(5, 4)))

# Fixed: the forecasts are of Consumption, so that is the series to plot
# (the original plotted Income); and a legend title is set with guides(),
# not labs(), which does not accept a guide_legend object.
autoplot(uschange[, "Consumption"]) +
  autolayer(fcast_avg, series = "Average") +
  autolayer(fcast_up, series = "Upper") +
  guides(colour = guide_legend(title = "Scenario"))

5.7 Matrix Formulation

5.8 Nonlinear regression

Forecasting with a nonlinear trend

h <- 10
# Linear trend.
fit_lin <- tslm(marathon ~ trend)

fcast_line <- forecast(fit_lin, h = h)

# Exponential trend: lambda = 0 is a log transformation.
fit_exp <- tslm(marathon ~ trend, lambda = 0)
fcast_exp <- forecast(fit_exp, h = h)

# Piecewise linear trend with knots at 1940 and 1980; tb1/tb2 are the
# hinge terms max(0, t - knot).
t <- time(marathon)
t.break1 <- 1940
t.break2 <- 1980
tb1 <- ts(pmax(0, t - t.break1), start = 1897)
tb2 <- ts(pmax(0, t - t.break2), start = 1897)


fit.pw <- tslm(marathon ~ t + tb1 + tb2)
t.new <- t[length(t)] + seq(h)
tb1.new <- tb1[length(tb1)] + seq(h)
tb2.new <- tb2[length(tb2)] + seq(h)


newdata <- cbind(t = t.new, tb1 = tb1.new, tb2 = tb2.new) %>%
  as.data.frame()
fcasts.pw <- forecast(fit.pw, newdata = newdata)

# Cubic regression spline with the same knots.
fit.spline <- tslm(marathon ~ t + I(t^2) + I(t^3) +
  I(tb1^3) + I(tb2^3))
fcasts.spl <- forecast(fit.spline, newdata = newdata)

autoplot(marathon) +
  autolayer(fitted(fit_lin), series = "Linear") +
  autolayer(fitted(fit_exp), series = "Exponential") +
  autolayer(fitted(fit.pw), series = "Piecewise") +
  autolayer(fitted(fit.spline), series = "Cubic Spline") +
  autolayer(fcasts.pw, series="Piecewise") +
  autolayer(fcast_line, series="Linear", PI=FALSE) +
  autolayer(fcast_exp, series="Exponential", PI=FALSE) +
  autolayer(fcasts.spl, series="Cubic Spline", PI=FALSE) +
  # Fixed: axis labels were swapped (x is time in years, y is the winning time).
  labs(x = "Year", y = "Winning times in minutes")

# Smoothing-spline forecast of the marathon series; lambda = 0 applies a
# log transformation so forecasts and intervals stay positive.
marathon %>%
  splinef(lambda = 0) %>%
  autoplot()

# Residual diagnostics for the spline fit.
marathon %>%
  splinef(lambda = 0) %>%
  checkresiduals()
## Warning in modeldf.default(object): Could not find appropriate degrees of
## freedom for this model.

5.9 Correlation, causation and forecasting

Correlation is not causation

It’s important to understand that correlations are useful for forecasting, even when there is no causal relationship between the two variables.

Confounded predictors

We say that two variables are confounded when their effects on the forecast variable cannot be separated.

Confounding is not really a problem for forecasting, as we can still compute forecasts without needing to separate out the effects of the predictors. It will become a problem with scenario forecasting, as the scenarios should take account of the relationships between predictors.

Multicollinearity and forecasting

When multicollinearity is present, the uncertainty associated with individual regression coefficients will be large.

Forecasts will be unreliable if the values of the future predictors are outside the range of the historical values of the predictors.

5.10 Exercises

# First 20 days of the daily electricity data.
daily20 <- head(elecdaily, 20)

autoplot(daily20[, c("Demand", "Temperature")])

# fit lm model

# Regress demand on temperature. NOTE(review): `eclec_fit` looks like a
# typo for `elec_fit`, but the name is reused below so it is kept.
eclec_fit <- tslm(Demand ~ Temperature, data = daily20)

summary(eclec_fit)
## 
## Call:
## tslm(formula = Demand ~ Temperature, data = daily20)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -46.060  -7.117  -1.437  17.484  27.102 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  39.2117    17.9915   2.179   0.0428 *  
## Temperature   6.7572     0.6114  11.052 1.88e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 22 on 18 degrees of freedom
## Multiple R-squared:  0.8716, Adjusted R-squared:  0.8644 
## F-statistic: 122.1 on 1 and 18 DF,  p-value: 1.876e-09

There exists a positive relationship between temperature and electricity demand, because people use more electricity during hot weather.

# Residual diagnostics for the demand-on-temperature regression.
checkresiduals(eclec_fit)

## 
##  Breusch-Godfrey test for serial correlation of order up to 5
## 
## data:  Residuals from Linear regression model
## LM test = 3.8079, df = 5, p-value = 0.5774

The time plot shows some changing variation over time, which will potentially make the prediction interval inaccurate. And the residuals seem to be skewed, which may affect the coverage probability of the prediction intervals.

# Demand against temperature with an OLS line (se = FALSE, not F).
daily20 %>%
  as.data.frame() %>%
  ggplot(aes(Temperature, Demand)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE)

# Distribution of the observed temperatures in the 20-day sample.
daily20 %>%
  as.data.frame() %>%
  ggplot(aes(Temperature)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

I probably would not believe the prediction because this range exceeds my training dataset boundaries.

# Forecast demand for temperatures 15-35C; part of this range lies
# outside the training data, so the extrapolation is unreliable.
elec_predict <- forecast(eclec_fit, newdata = data.frame(Temperature = seq(15, 35, by = 1)))

# Full data set: the demand/temperature relationship outside the 20-day
# window does not follow the fitted straight line (se = FALSE, not F).
elecdaily %>%
  as.data.frame() %>%
  ggplot(aes(Temperature, Demand)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE)

The model only has data available from 20 degrees to 40 degrees, and data outside this range follow a different pattern than data inside the range.

Winning times in Olympic men’s 400m track final.

# tbl_df() is defunct in current dplyr; build the tibble directly while
# keeping the original column name `x`, which the models below rely on.
# Rows are every 4 years starting at 1896 (the Olympic cycle).
mens400_df <- tibble(x = as.numeric(mens400)) %>%
  mutate(year = row_number() * 4 + 1892)


mens400_df %>%
  ggplot(aes(year, x)) +
  geom_line()
## Don't know how to automatically pick scale for object of type ts. Defaulting to continuous.

The time to finish the 400 m track final decreased drastically from 1896 to 1920; it is a downward slope.

Fit a linear model to see at what rate the winning times decrease.

# Linear model of winning time on year; printing shows the slope
# (about -0.065 seconds per year).
times_fit <- tslm(x ~ year, mens400_df)


times_fit
## 
## Call:
## tslm(formula = x ~ year, data = mens400_df)
## 
## Coefficients:
## (Intercept)         year  
##   172.48148     -0.06457

Finishing times decrease by around 0.06 seconds per year.

# Plot the model residuals over time.
augment(times_fit) %>%
  ggplot(aes(x = year, y = .resid)) +
  geom_line()
## Warning in augment_columns(x, data, newdata, type.predict = type.predict, :
## When fitting with na.exclude, rows with NA in original data will be dropped
## unless those rows are provided in 'data' argument

This indicates that the prediction interval of this line is not stable.

# Rebuild the winning times as a ts; frequency = 1/4 encodes one
# observation every 4 time units (years), starting in 1896.
mens400_ts <- ts(mens400_df[, c("x")], start = 1896, frequency = 1/4)

times_fit <- tslm(x ~ trend, data = mens400_ts)

# Data plus forecasts for the next two Olympiads.
autoplot(mens400_ts)+
  autolayer(forecast(times_fit, h = 2)) +
  labs(x = "Year", y = "Seconds")

# Point forecasts with 80% and 95% intervals for 2020 and 2024.
forecast(times_fit, h = 2)
##      Point Forecast    Lo 80    Hi 80    Lo 95    Hi 95
## 2020       42.04231 40.44975 43.63487 39.55286 44.53176
## 2024       41.78402 40.18198 43.38605 39.27976 44.28827

The assumption I made in the calculation is that this linear trend will persist for the foreseeable future.

# Proportion of the Easter holiday period falling in each quarter.
head(easter(ausbeer), 20)
##      Qtr1 Qtr2 Qtr3 Qtr4
## 1956 0.67 0.33 0.00 0.00
## 1957 0.00 1.00 0.00 0.00
## 1958 0.00 1.00 0.00 0.00
## 1959 1.00 0.00 0.00 0.00
## 1960 0.00 1.00 0.00 0.00

Sometimes the Easter holiday occurs in only one quarter; sometimes it spans more than one quarter.

autoplot(fancy)

# we need to adjust dailyaverage since different months have different days

# Monthly sales alongside average sales per day in each month.
fancy_df <- cbind(Monthly = fancy,
                  Dailyaverage = fancy / monthdays(fancy))

# Full argument name `facets` (the original `facet = T` relied on
# partial matching) and TRUE instead of T.
autoplot(fancy_df, facets = TRUE) +
  labs(x = "year", y = "Monthly sales") +
  scale_y_continuous(labels = dollar_format())

During 1992-94, we see much larger growth than in previous years.

If we want to fit a linear model it will be unreasonable, because the data show variation that increases with the level of the series. So a transformation is reasonable.

BoxCox transformation \(y_t = (\lambda*w_t + 1)^{1/\lambda}\), \(y_t\) is original data, \(w_t\) is transformed data.

# Choose a Box-Cox lambda and transform the sales series.
lambda <- BoxCox.lambda(fancy)
fancy_trans <- BoxCox(fancy, lambda = lambda)

autoplot(fancy_trans)

# Regression with trend and monthly seasonal dummies on the transformed data.
fancy_lm <- tslm(fancy_trans ~ trend + season)

# Residuals against fitted values.
fancy_lm %>%
  augment() %>%
  ggplot(aes(x = .fitted, y = .resid)) +
  geom_point()
## Don't know how to automatically pick scale for object of type ts. Defaulting to continuous.

# Residual diagnostics plus the Breusch-Godfrey serial-correlation test.
checkresiduals(fancy_lm)

## 
##  Breusch-Godfrey test for serial correlation of order up to 16
## 
## data:  Residuals from Linear regression model
## LM test = 37.186, df = 16, p-value = 0.001974

The plots do not reveal any problems with the model

# Residuals grouped by calendar month (84 observations = 7 full years).
fit_resid <- data.frame(
  resids = residuals(fancy_lm),
  month = rep(1:12, times = 7)
)

# Boxplot of residuals per month.
ggplot(fit_resid, aes(x = factor(month), y = resids)) +
  geom_boxplot()
## Don't know how to automatically pick scale for object of type ts. Defaulting to continuous.

No it does not reveal any problem

# Tidy coefficient table: every monthly dummy is positive relative to
# January, with November and December largest.
tidy(fancy_lm)
## # A tibble: 13 x 5
##    term        estimate std.error statistic  p.value
##    <chr>          <dbl>     <dbl>     <dbl>    <dbl>
##  1 (Intercept)   7.67    0.0784       97.7  1.93e-77
##  2 trend         0.0228  0.000862     26.5  1.52e-38
##  3 season2       0.255   0.101         2.52 1.39e- 2
##  4 season3       0.708   0.101         6.99 1.23e- 9
##  5 season4       0.390   0.101         3.85 2.59e- 4
##  6 season5       0.415   0.101         4.10 1.10e- 4
##  7 season6       0.455   0.101         4.49 2.72e- 5
##  8 season7       0.620   0.101         6.11 4.87e- 8
##  9 season8       0.596   0.102         5.88 1.25e- 7
## 10 season9       0.679   0.102         6.68 4.42e- 9
## 11 season10      0.758   0.102         7.46 1.67e-10
## 12 season11      1.23    0.102        12.1  7.44e-19
## 13 season12      2.00    0.102        19.6  2.08e-30
# Residual diagnostics again, shown alongside the coefficient table.
checkresiduals(fancy_lm)

## 
##  Breusch-Godfrey test for serial correlation of order up to 16
## 
## data:  Residuals from Linear regression model
## LM test = 37.186, df = 16, p-value = 0.001974
# Seasonal subseries plot of the residuals by month.
ggsubseriesplot(fancy_lm$residuals)

The autocorrelation plot shows significant spikes at lags 9 and 24, but they are not quite enough for the Breusch-Godfrey test to be significant at the 5% level. In any case the autocorrelation is not particularly large, and at lags 9 and 24 it is unlikely to have any noticeable impact on the forecasts or prediction intervals.

# Forecast 36 months ahead (on the transformed scale) and plot.
fancy_predict <- forecast(fancy_lm, h = 36)

autoplot(fancy_predict)

  1. Fit a harmonic regression with trend to the data. Experiment with changing the number Fourier terms. Plot the observed gasoline and fitted values and comment on what you see.
# Weekly US gasoline supplies; NOTE(review): the name suggests 2014 but
# the window actually ends at the start of 2005 — confirm intent.
gasoline_2014 <- window(gasoline, end = 2005)

autoplot(gasoline_2014)

# Fit harmonic (Fourier) regressions with K = 1, 2, 3, 5, 10, 20 pairs.
# Each iteration assigns the fitted model and its plot into the global
# environment (tslm_fit_<K>_gasoline_2004 / plot_<K>) via assign(),
# because both sets of names are consumed after the loop.
for(num in c(1, 2, 3, 5, 10, 20)) {
 var_name <- paste("tslm_fit_",
                   num,
                   "_gasoline_2004",
                   sep = "")
 plot_name <- paste("plot",
                    num,
                    sep = "_")

 # Model: linear trend plus K Fourier pairs for the seasonality.
 assign(var_name,
        tslm(gasoline_2014 ~ trend + fourier(
          gasoline_2014,
          K = num
        )))

 # Data with this model's fitted values layered on top.
 assign(plot_name, autoplot(gasoline_2014) +
         autolayer(get(var_name)$fitted.values,
                   series = as.character(num)) +
         labs(title = var_name,
              y = "gasoline",
              subtitle = paste("fourier #", num)) +
         theme(legend.position = "none"))
}


grid.arrange(plot_1, plot_2, plot_3, plot_5,plot_10, plot_20, ncol = 2)

  1. Select the appropriate number of Fourier terms to include by minimising the AICc or CV value.
# Collect the six fitted models and compare their CV statistics.
p <- list(tslm_fit_1_gasoline_2004, tslm_fit_2_gasoline_2004, tslm_fit_3_gasoline_2004,
          tslm_fit_5_gasoline_2004, tslm_fit_10_gasoline_2004, tslm_fit_20_gasoline_2004)

cv_result <- lapply(p, CV) %>%
  as.data.frame()

# Fixed: paste0() already joins with an empty separator; `sep = ""` is not
# one of its formals, so it was silently pasted in as an extra empty string.
names(cv_result) <- paste0("CV", c(1, 2, 3, 5, 10, 20))
cv_result <- add_column(cv_result, names = rownames(cv_result))

# Long format keeps AIC and CV for each candidate K side by side.
cv_result %>%
  filter(names %in% c("AIC", "CV")) %>%
  gather(CV1:CV20, key = "key", value = "value")
##    names  key         value
## 1     CV  CV1  8.201921e-02
## 2    AIC  CV1 -1.813292e+03
## 3     CV  CV2  8.136854e-02
## 4    AIC  CV2 -1.819113e+03
## 5     CV  CV3  7.658846e-02
## 6    AIC  CV3 -1.863085e+03
## 7     CV  CV5  7.553646e-02
## 8    AIC  CV5 -1.873234e+03
## 9     CV CV10  7.135754e-02
## 10   AIC CV10 -1.915014e+03
## 11    CV CV20  7.223834e-02
## 12   AIC CV20 -1.908017e+03
# Harmonic regression with K = 7 Fourier pairs plus a linear trend.
tslm_ft7_gasoline_2004 <- tslm(
  gasoline_2014 ~ trend + fourier(gasoline_2014, K = 7)
)

checkresiduals(tslm_ft7_gasoline_2004)

## 
##  Breusch-Godfrey test for serial correlation of order up to 104
## 
## data:  Residuals from Linear regression model
## LM test = 149.31, df = 104, p-value = 0.002404
# Forecast 52 weeks ahead, supplying future Fourier terms as newdata.
fc_gasoline_2005 <- forecast(
  tslm_ft7_gasoline_2004,
  newdata = data.frame(fourier(gasoline_2014, K = 7, h = 52))
)

# Forecasts with the actual 2005 observations layered on top.
autoplot(fc_gasoline_2005) +
  autolayer(window(gasoline, start = 2005, end = 2006), series = "Forecast")

Almost all the actual data fall into the 80% prediction interval, but the model could not predict the sudden dip in the fall.

autoplot(huron)

The chart exhibits a downward trend from 1875 to 1930, but after that the trend disappears.

  1. Fit a linear regression and compare this to a piecewise linear trend model with a knot at 1915.
# Simple linear trend. Fixed: use <- for assignment rather than =.
h <- 10

tslm_huron <- tslm(huron ~ trend)
fc_tslm <- forecast(tslm_huron, h = h)

# Power (Box-Cox) transformation, then a linear trend on the transformed scale.

lambda <- BoxCox.lambda(huron)
huron_box <- BoxCox(huron, lambda)

tslm_log_huron <- tslm(huron_box ~ trend)
fc_box_tslm <- forecast(tslm_log_huron, h = h)

# Piecewise linear trend with a knot at 1915.

t <- time(huron)
t.break <- 1915
t_piece <- ts(pmax(0, t - t.break), start = 1875)

pw_huron <- tslm(huron ~ t + t_piece)
t_new <- t[length(t)] + seq(h)
t_piece_new <- t_piece[length(t_piece)] + seq(h)

newdata <- cbind(t = t_new,
                 t_piece = t_piece_new) %>%
  as.data.frame()

fc_pw_huron <- forecast(
  pw_huron,
  newdata = newdata
  )

# Cubic spline with the same knot (fixed comment typo "cubin").
spline_huron <- tslm(huron ~ t + I(t^2) + I(t^3) + I(t_piece^2) + I(t_piece^3))

fc_spline_huron <- forecast(
  spline_huron,
  newdata = newdata
  )

# All four fits and their forecasts on one plot. Fixed: legend label
# typo "Expontial" -> "Exponential", and PI = FALSE rather than F.
autoplot(huron) +
  autolayer(fitted(fc_tslm), series = "Linear") +
  autolayer(fitted(fc_box_tslm), series = "Exponential") +
  autolayer(fitted(fc_pw_huron), series = "PieceWise") +
  autolayer(fitted(fc_spline_huron), series = "Cubic Spline") +
  autolayer(fc_tslm, series = "Linear", PI = FALSE) +
  autolayer(fc_box_tslm, series = "Exponential", PI = FALSE) +
  autolayer(fc_pw_huron, series = "PieceWise", PI = FALSE) +
  autolayer(fc_spline_huron, series = "Cubic Spline", PI = FALSE)

  1. Generate forecasts from these two models for the period up to 1980 and comment on these.
# Side-by-side comparison: simple linear trend vs piecewise trend.
p1 <- autoplot(huron) +
  autolayer(fitted(tslm_huron), series = "LM") +
  autolayer(fc_tslm, series = "linear") +
  labs(title = "LM model") +
  theme(legend.position = "none")

# The piecewise model allows the slope to change at the 1915 knot.
p2 <- autoplot(huron) +
  autolayer(fitted(pw_huron), series = "PieceWise") +
  autolayer(fc_pw_huron, series = "PieceWise") +
  labs(title = "PieceWise model") +
  theme(legend.position = "none")

grid.arrange(p1, p2, ncol = 2)

The linear regression trend does not capture the trend change around 1915; the piecewise regression does capture it.